from IPython.display import HTML
# Build an HTML snippet with a button that shows/hides every notebook code
# input cell ('div.input'). `code_toggle` is also registered to run once when
# the document is ready, and since `code_show` starts as true that first call
# hides the inputs.
# NOTE(review): the HTML object is only rendered when it is the last
# expression of a notebook cell; in a flat script the value is discarded.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from IPython.display import set_matplotlib_formats
from sklearn.preprocessing import MinMaxScaler
# Ask IPython to emit inline figures in the 'jpg' format.
set_matplotlib_formats('jpg')
# Suppress every warning for the rest of the session. This also hides pandas
# chained-assignment and library deprecation warnings, so disable it when
# debugging.
warnings.filterwarnings('ignore')
# IPython magic (not plain Python): draw matplotlib figures inline in the notebook.
%matplotlib inline
# Load the training CSV and take a first look at it: leading rows, column
# dtypes / non-null counts, and summary statistics (one row per column).
train_csv = '../data/diamonds_train.csv'
data = pd.read_csv(train_csv)
data.head(n=10)
data.info()
data.describe().transpose()
# Side-by-side heatmaps of the Pearson, Kendall and Spearman correlation
# matrices of the raw data.
fig, ax = plt.subplots(ncols=3, nrows=1, figsize=(30, 7))
corr_methods = ['pearson', 'kendall', 'spearman']
# Correlation is only defined for numeric columns. Older pandas silently
# dropped the string columns (cut / color / clarity); pandas >= 2.0 raises
# instead, so select the numeric columns explicitly.
numeric_data = data.select_dtypes(include='number')
for axis, method in zip(ax, corr_methods):
    sns.heatmap(numeric_data.corr(method=method), annot=True, fmt='.2f', ax=axis)
    axis.set_title(f'{method.upper()} Correlation')
plt.show()
# Pairwise relationship grid (seaborn pairplot) over the raw data frame.
sns.pairplot(data)
plt.show()
In general, we can see that several of the properties are correlated with each other.
# One scatter panel per feature (carat, depth, x, y, z), each against price,
# drawn from the raw (uncleaned) data.
fig, axes = plt.subplots(ncols=5, nrows=1, figsize=(20, 5))
ax1, ax2, ax3, ax4, ax5 = axes
features = ('carat', 'depth', 'x', 'y', 'z')
for ax, column in zip((ax1, ax2, ax3, ax4, ax5), features):
    ax.scatter(data[column], data['price'])
    ax.set(title=f'{column} vs price', xlabel=column, ylabel='Price')
plt.tight_layout()
Some correlations are as expected: the higher the value, the more expensive the diamond.
However, some diamonds have x, y, or z values equal to 0. These values need to be cleaned out to get a more accurate analysis.
# Row filters for the cleaned data set.
# Rows where any of x, y or z is exactly 0 are treated as invalid measurements.
filter_x = data['x'] != 0
filter_y_01 = data['y'] != 0
# Additionally discard rows whose y is 40 or more (extreme outliers).
filter_y_02 = data['y'] < 40
filter_z = data['z'] != 0
# Take an explicit copy: `data_clean` is extended with new columns and has
# rows dropped in place later on, and doing that on a filtered slice of
# `data` triggers pandas' chained-assignment (SettingWithCopy) warning.
data_clean = data[filter_x & filter_y_01 & filter_y_02 & filter_z].copy()
# Same five feature-vs-price scatter panels, now drawn from the cleaned data.
fig, axes = plt.subplots(ncols=5, nrows=1, figsize=(20, 5))
ax1, ax2, ax3, ax4, ax5 = axes
features = ('carat', 'depth', 'x', 'y', 'z')
for ax, column in zip((ax1, ax2, ax3, ax4, ax5), features):
    ax.scatter(data_clean[column], data_clean['price'])
    ax.set(title=f'{column} vs price', xlabel=column, ylabel='Price')
plt.tight_layout()
Now the data seems to be more consistent.
Let's have a look at the values with very low depth and price between 2500 and 5000. Are they correct?
# Boolean mask of the rows whose depth is below 50, then display those rows.
filter_depth = data_clean['depth'].lt(50)
data_clean.loc[filter_depth]
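First of all, let's check whether the depth value is correct.
We know depth is z / [(x + y) / 2]. Note that the `depth` column is expressed as a percentage, while the ratio computed below (`depth_calc`) is not multiplied by 100, so `depth` should be compared with 100 × `depth_calc`.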
# Recompute depth from the raw dimensions as z divided by the mean of x and y
# (a plain ratio, not scaled by 100), then inspect the result.
mean_xy = (data_clean['x'] + data_clean['y']) / 2
data_clean['depth_calc'] = data_clean['z'] / mean_xy
data_clean.head()
# Re-display the low-depth rows, now including the recomputed column.
data_clean.loc[filter_depth]
We can see that depth values do not match with the calculated parameter depth_calc.
We will erase these 3 diamonds to have a more accurate analysis and reset the index.
# Preview the tail before the drop. The reset_index() result is only
# displayed, never assigned, so `data_clean` keeps its original row labels.
data_clean.reset_index().tail(5)
# Remove the rows by their original index labels.
# NOTE(review): these labels are hard-coded for the current CSV — presumably
# they are the rows selected by `filter_depth`; confirm if the data changes.
rows_to_drop = [242, 3205, 30278]
data_clean.drop(index=rows_to_drop, inplace=True)
data_clean.reset_index().tail(5)
Let's plot again the graphs to see what we have now
# Redraw the five feature-vs-price scatter panels after the drop above.
fig, axes = plt.subplots(ncols=5, nrows=1, figsize=(20, 5))
ax1, ax2, ax3, ax4, ax5 = axes
features = ('carat', 'depth', 'x', 'y', 'z')
for ax, column in zip((ax1, ax2, ax3, ax4, ax5), features):
    ax.scatter(data_clean[column], data_clean['price'])
    ax.set(title=f'{column} vs price', xlabel=column, ylabel='Price')
plt.tight_layout()
Now let's analyze diamonds with z value very low and price between 5000 and 10000
# Boolean mask of the rows whose z is below 2, then display those rows.
filter_z = data_clean['z'].lt(2)
data_clean.loc[filter_z]
We can see again that depth values do not match with the calculated parameter depth_calc.
Let's erase these values.
# Remove two more rows by their original index labels.
# NOTE(review): hard-coded for the current CSV — presumably the rows selected
# by `filter_z`; confirm if the data changes.
rows_to_drop = [6688, 13044]
data_clean.drop(index=rows_to_drop, inplace=True)
# Display only: the reset index is not stored back into `data_clean`.
data_clean.reset_index().tail(5)
Let's plot again the graphs to see what we have now
# Redraw the five feature-vs-price scatter panels after the second drop.
fig, axes = plt.subplots(ncols=5, nrows=1, figsize=(20, 5))
ax1, ax2, ax3, ax4, ax5 = axes
features = ('carat', 'depth', 'x', 'y', 'z')
for ax, column in zip((ax1, ax2, ax3, ax4, ax5), features):
    ax.scatter(data_clean[column], data_clean['price'])
    ax.set(title=f'{column} vs price', xlabel=column, ylabel='Price')
plt.tight_layout()
Let's have a look again at depth values over 75
# Boolean mask of the rows whose depth is above 75, then display those rows.
filter_depth = data_clean['depth'].gt(75)
data_clean.loc[filter_depth]
Let's compare with other diamonds with the same characteristics
# Show diamonds comparable to the high-depth one: price strictly between
# 2400 and 2700 with cut 'Fair', color 'E' and clarity 'VS2'.
price_in_band = (data_clean['price'] > 2400) & (data_clean['price'] < 2700)
same_grades = (
    (data_clean['cut'] == 'Fair')
    & (data_clean['color'] == 'E')
    & (data_clean['clarity'] == 'VS2')
)
data_clean[price_in_band & same_grades]
From this comparison we conclude that the value could be correct: if a diamond is too deep (high depth % value), its price decreases.
# Distinct values present in each categorical column, in order of first
# appearance. Only the lengths of these lists are used further down.
cut_types, color_types, clarity_types = (
    list(data_clean[name].unique()) for name in ('cut', 'color', 'clarity')
)
# Ordinal encodings: each grade maps to its 1-based position in the list.
# NOTE(review): the lists are presumably ordered from worst to best grade.
# The insertion order matters: later plots index into list(<dict>.keys()).
cut_dict = {
    grade: rank
    for rank, grade in enumerate(
        ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], start=1)
}
color_dict = {
    grade: rank
    for rank, grade in enumerate(['J', 'I', 'H', 'G', 'F', 'E', 'D'], start=1)
}
clarity_dict = {
    grade: rank
    for rank, grade in enumerate(
        ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], start=1)
}


def cut(cut_value):
    """Return the ordinal rank of a cut grade; raise KeyError if unknown."""
    rank = cut_dict[cut_value]
    return rank


def color(color_value):
    """Return the ordinal rank of a color grade; raise KeyError if unknown."""
    rank = color_dict[color_value]
    return rank


def clarity(clarity_value):
    """Return the ordinal rank of a clarity grade; raise KeyError if unknown."""
    rank = clarity_dict[clarity_value]
    return rank
# Ordinal encodings of the three categorical grades. Mapping each column
# element-wise gives the same values (and the same KeyError on an unknown
# grade) as the former row-wise DataFrame.apply(axis=1), without building a
# Series object for every row.
data_clean['cut_calc'] = data_clean['cut'].map(cut)
data_clean['color_calc'] = data_clean['color'].map(color)
data_clean['clarity_calc'] = data_clean['clarity'].map(clarity)
# Product of the three dimensions, and carat per unit of that product.
data_clean['volume_calc'] = data_clean['x'] * data_clean['y'] * data_clean['z']
data_clean['density_calc'] = data_clean['carat'] / data_clean['volume_calc']
data_clean.head()
# Pearson / Kendall / Spearman correlation heatmaps of the cleaned data,
# followed by a colour-graded table of the Spearman matrix.
fig, ax = plt.subplots(ncols=3, nrows=1, figsize=(30, 7))
corr_methods = ['pearson', 'kendall', 'spearman']
# Correlation is only defined for numeric columns; pandas >= 2.0 raises on the
# string columns (cut / color / clarity) instead of silently dropping them.
numeric_clean = data_clean.select_dtypes(include='number')
# Compute each matrix once so the Spearman table below can reuse it.
corr_by_method = {method: numeric_clean.corr(method=method) for method in corr_methods}
for axis, method in zip(ax, corr_methods):
    sns.heatmap(corr_by_method[method], annot=True, fmt='.2f', ax=axis)
    axis.set_title(f'{method.upper()} Correlation')
plt.show()
# Styler.set_precision was removed in pandas 2.0; an explicit format string
# gives the same two-decimal rendering on every pandas version.
corr_by_method['spearman'].style.background_gradient(cmap='coolwarm').format('{:.2f}')
# Carat vs price with one regression fit per cut grade.
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(20, 10))
for cut_name, cut_rank in cut_dict.items():
    subset = data_clean[data_clean['cut_calc'] == cut_rank]
    if subset.empty:
        # regplot cannot fit a regression on an empty frame.
        continue
    # Label each series explicitly. Passing a bare list to legend() assigns
    # labels by artist order, and every regplot call adds several artists
    # (points, line, confidence band), so labels could land on the wrong ones.
    sns.regplot(x='carat', y='price', scatter=True, data=subset, label=cut_name, ax=ax)
ax.set_title('Carat vs Price by CUT')
ax.legend()
plt.tight_layout()
plt.show()
# Carat vs price with one regression fit per color grade.
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(20, 10))
for color_name, color_rank in color_dict.items():
    subset = data_clean[data_clean['color_calc'] == color_rank]
    if subset.empty:
        # regplot cannot fit a regression on an empty frame.
        continue
    # Label each series explicitly. Passing a bare list to legend() assigns
    # labels by artist order, and every regplot call adds several artists
    # (points, line, confidence band), so labels could land on the wrong ones.
    sns.regplot(x='carat', y='price', scatter=True, data=subset, label=color_name, ax=ax)
ax.set_title('Carat vs Price by COLOR')
ax.legend()
plt.tight_layout()
plt.show()
# Carat vs price (left) and volume vs price (right), with one regression fit
# per clarity grade.
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))
ax1 = ax[0]
ax2 = ax[1]
for clarity_name, clarity_rank in clarity_dict.items():
    subset = data_clean[data_clean['clarity_calc'] == clarity_rank]
    if subset.empty:
        # regplot cannot fit a regression on an empty frame.
        continue
    # Explicit labels keep the legend correct; a bare list passed to legend()
    # is matched to artists by position, and regplot adds several per call.
    sns.regplot(x='carat', y='price', scatter=True, data=subset, label=clarity_name, ax=ax1)
    sns.regplot(x='volume_calc', y='price', scatter=True, data=subset, label=clarity_name, ax=ax2)
ax1.set_title('Carat vs Price by CLARITY')
ax1.legend()
ax2.set_title('Volume vs Price by CLARITY')
ax2.legend()
plt.tight_layout()
plt.show()
# Same clarity breakdown as the previous figure, shown with a logarithmic
# price axis and fixed axis limits.
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))
ax1 = ax[0]
ax2 = ax[1]
for clarity_name, clarity_rank in clarity_dict.items():
    subset = data_clean[data_clean['clarity_calc'] == clarity_rank]
    if subset.empty:
        # regplot cannot fit a regression on an empty frame.
        continue
    # Explicit labels keep the legend correct; a bare list passed to legend()
    # is matched to artists by position, and regplot adds several per call.
    sns.regplot(x='carat', y='price', scatter=True, data=subset, label=clarity_name, ax=ax1)
    sns.regplot(x='volume_calc', y='price', scatter=True, data=subset, label=clarity_name, ax=ax2)
ax1.set_title('Carat vs Price by CLARITY (log y-axis scale)')
ax1.legend()
ax2.set_title('Volume vs Price by CLARITY (log y-axis scale)')
ax2.legend()
ax1.set_yscale('log')
ax1.set_ylim(300, 20000)
ax2.set_yscale('log')
ax2.set_ylim(300, 20000)
ax2.set_xlim(0, 700)
plt.tight_layout()
plt.show()
# One row per cut grade: carat vs price (left) and volume vs price (right),
# each with one regression fit per clarity grade.
fig, ax = plt.subplots(ncols=2, nrows=len(cut_dict), figsize=(20, 25))
# (x column, title word, x-axis limits) for the left and right panels.
panels = (('carat', 'Carat', (0, 4.5)), ('volume_calc', 'Volume', (0, 700)))
for row_axes, (cut_name, cut_rank) in zip(ax, cut_dict.items()):
    in_cut = data_clean['cut_calc'] == cut_rank
    for panel_ax, (x_column, x_title, x_limits) in zip(row_axes, panels):
        for clarity_name, clarity_rank in clarity_dict.items():
            subset = data_clean[(data_clean['clarity_calc'] == clarity_rank) & in_cut]
            if subset.empty:
                # Some cut/clarity combinations may have no rows, and regplot
                # cannot fit a regression on an empty frame.
                continue
            # Explicit labels keep the legend right even when a combination
            # is skipped; legend([...]) matches labels to artists by position.
            sns.regplot(x=x_column, y='price', scatter=True, data=subset,
                        label=clarity_name, ax=panel_ax)
        panel_ax.set_ylim(300, 20000)
        panel_ax.set_xlim(*x_limits)
        panel_ax.set_title(f'Cut {cut_name} - {x_title} vs Price by CLARITY')
        panel_ax.legend()
plt.tight_layout()
plt.show()
# One row per color grade: carat vs price (left) and volume vs price (right),
# each with one third-order polynomial fit per clarity grade.
fig, ax = plt.subplots(ncols=2, nrows=len(color_dict), figsize=(20, 45))
# (x column, title word, x-axis limits) for the left and right panels.
panels = (('carat', 'Carat', (0, 4.5)), ('volume_calc', 'Volume', (0, 700)))
for row_axes, (color_name, color_rank) in zip(ax, color_dict.items()):
    in_color = data_clean['color_calc'] == color_rank
    for panel_ax, (x_column, x_title, x_limits) in zip(row_axes, panels):
        for clarity_name, clarity_rank in clarity_dict.items():
            subset = data_clean[(data_clean['clarity_calc'] == clarity_rank) & in_color]
            if subset.empty:
                # Some color/clarity combinations may have no rows, and
                # regplot cannot fit a regression on an empty frame.
                continue
            # Explicit labels keep the legend right even when a combination
            # is skipped; legend([...]) matches labels to artists by position.
            sns.regplot(x=x_column, y='price', scatter=True, data=subset,
                        label=clarity_name, ax=panel_ax, order=3)
        panel_ax.set_ylim(300, 20000)
        panel_ax.set_xlim(*x_limits)
        panel_ax.set_title(f'Color "{color_name}" - {x_title} vs Price by CLARITY')
        panel_ax.legend()
plt.tight_layout()
plt.show()
# Pairwise relationship grid over the original columns plus the derived
# volume and density columns of the cleaned data.
pairplot_columns = [
    'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price',
    'x', 'y', 'z', 'volume_calc', 'density_calc',
]
sns.pairplot(data_clean[pairplot_columns])
plt.show()
# Rescale the three ordinal grade columns with MinMaxScaler. This overwrites
# the integer ranks stored in the *_calc columns.
scaler = MinMaxScaler()
grade_columns = ['cut_calc', 'color_calc', 'clarity_calc']
data_clean[grade_columns] = scaler.fit_transform(data_clean[grade_columns])
# Weighted sum of the scaled grades (weights 3 / 3 / 4), rounded to a whole number.
weighted_grades = (
    3*data_clean['cut_calc'] + 3*data_clean['color_calc'] + 4*data_clean['clarity_calc']
)
data_clean['score'] = weighted_grades.round()
data_clean.head()
# 2x2 grid: carat vs price (left) and volume vs price (right), with one
# regression fit per score value 0..10. The top row uses a linear price axis,
# the bottom row a logarithmic one.
fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(20, 20))
for row_axes in ax:
    for panel_ax, x_column in zip(row_axes, ('carat', 'volume_calc')):
        for score in range(11):
            subset = data_clean[data_clean['score'] == score]
            if subset.empty:
                # Not every score value has to occur, and regplot cannot fit
                # a regression on an empty frame.
                continue
            # Explicit labels keep the legend right even when a score is
            # missing; legend([...]) matches labels to artists by position.
            sns.regplot(x=x_column, y='price', scatter=True, data=subset,
                        label=str(score), ax=panel_ax)
        panel_ax.set_ylim(300, 20000)
        panel_ax.legend()
    row_axes[1].set_xlim(0, 700)
ax[0][0].set_title('Carat vs Price by SCORE')
ax[0][1].set_title('Volume vs Price by SCORE')
ax[1][0].set_title('Carat vs Price by SCORE (log y-axis scale)')
ax[1][1].set_title('Volume vs Price by SCORE (log y-axis scale)')
# Name the bottom row explicitly instead of relying on the loop variable's
# final value leaking out of the loop.
ax[1][0].set_yscale('log')
ax[1][1].set_yscale('log')
plt.tight_layout()
plt.show()
# Persist the cleaned, feature-enriched frame. No `index` argument is passed,
# so pandas' default applies and the row labels are written as the first,
# unnamed CSV column.
data_clean.to_csv('../data/data_clean.csv')